# Packages this script depends on.
pkgs <- c(
  "foreign", "tidyverse", "stringr", "reshape2",
  "lubridate", "stm", "SnowballC", "tm",
  "tidytext"
)

# Install whichever of them are not yet present. install.packages() accepts a
# character vector, so no per-package loop is needed.
to_install <- setdiff(pkgs, installed.packages()[, "Package"])
if (length(to_install) > 0) {
  install.packages(to_install, dependencies = TRUE)
}

# Attach with library() rather than require(): library() raises an error when
# a package cannot be loaded, whereas require() merely returns FALSE and lets
# the script continue until it fails somewhere less obvious.
for (pkg in pkgs) {
  library(pkg, character.only = TRUE)
}

# Remove the bookkeeping objects so they do not linger in the workspace.
rm(pkgs, to_install, pkg)

# Read the corpus as UTF-8. The argument is spelled out as `locale`; the
# previous `local =` only worked through R's partial argument matching.
data_new <- read_tsv(
  "TJ_subset_dates_penal.tsv",
  locale = locale(encoding = "UTF-8")
)
# Drop the first two columns of the file (not used below).
data_new <- data_new[, -c(1, 2)]
docs <- data_new[["full_text"]]

# Tokenise, stem and remove stopwords. Case is preserved (lowercase = FALSE).
# NOTE(review): the custom stopwords are all lower-case, so with case
# preserved capitalised variants presumably survive -- confirm this is wanted.
output <- textProcessor(
  docs,
  language = "es",
  customstopwords = c(
    "votos", "pleno", "sala",
    "unanimidad", "publicar",
    "nombre", "ponente"
  ),
  lowercase = FALSE
)
# textProcessor() discards documents that end up empty and reports their
# positions in `docs.removed`. Drop the same rows from `data_new` so that ids
# and dates taken from it later stay aligned with the modelled documents.
if (length(output$docs.removed) > 0) {
  data_new <- data_new[-output$docs.removed, ]
}

prepped <- prepDocuments(
  output$documents, output$vocab
)
# prepDocuments() can discard further documents; its `docs.removed` indexes
# into the documents it was given (i.e. after the removal above).
if (length(prepped$docs.removed) > 0) {
  data_new <- data_new[-prepped$docs.removed, ]
}
# Keep the raw texts in step with the rows that were actually modelled.
docs <- data_new[["full_text"]]
# Fit one structural topic model per candidate number of topics (3 to 6).
# Each fit gets its own seed, offset by the topic count. The models are
# collected, in order of increasing K, in the unnamed list `stm_list`.
stm_list <- lapply(3:6, function(n_topics) {
  stm(
    prepped$documents, prepped$vocab,
    K = n_topics, seed = 12202019 + n_topics
  )
})

# Per-document metadata: the document id plus a date set to 1 January of the
# document's year.
ids <- data_new$label_id
date_inf <- data_new[, "year"]
# Same result as cbind(ids, date_inf): a plain data frame with the columns
# `ids` and `year`.
meta <- data.frame(ids, date_inf, check.names = FALSE)
meta$date <- ymd(paste(meta$year, "1", "1", sep = "/"))

# Diagnostics: ids and row positions of documents whose date failed to parse.
date_missing <- is.na(meta$date)
meta[date_missing, 1]
i <- 1

which(date_missing)

# save(stm_list, file = "stm_paper_tortura.rdata")

# For every fitted model:
#   1. assign each document to its highest-probability topic,
#   2. plot, per year, the share of documents assigned to each topic and save
#      the figure as ~/topics_<K>.png,
#   3. write the ten top FREX words of each topic to ~/topics_<K>.txt,
# where <K> is the model's number of topics.
for (i in seq_along(stm_list)) {
  stm_output <- stm_list[[i]]
  theta <- stm_output$theta
  # Take the topic count from the model itself instead of deriving it from
  # the loop index, so labels and file names stay right if the set of fitted
  # models changes.
  n_topics <- ncol(theta)

  # Ids and dates are matched to theta by row position, so fail loudly if
  # documents were dropped during preprocessing and the row counts differ.
  if (nrow(theta) != nrow(data_new) || nrow(theta) != nrow(meta)) {
    stop(
      "Model ", i, " has ", nrow(theta), " documents, but data_new has ",
      nrow(data_new), " rows and meta has ", nrow(meta),
      "; ids and dates would be misaligned.",
      call. = FALSE
    )
  }

  # Build the data frame directly: cbind()-ing the ids onto the matrix would
  # coerce every topic proportion to character when the ids are character.
  out <- data.frame(label_id = data_new$label_id, theta)
  colnames(out)[-1] <- paste0("topic_", seq_len(n_topics))

  out$date <- meta$date
  # Index of the most probable topic for each document.
  out$top <- apply(theta, 1, which.max)

  # Documents per (year, topic), divided by the number of documents that year.
  toPlot <- out %>%
    group_by(year = year(date), top) %>%
    summarise(count = n()) %>%
    group_by(year) %>%
    mutate(rate = count / sum(count)) %>%
    ungroup()

  gplt <- ggplot(
    toPlot,
    aes(x = year, y = rate, colour = factor(top))
  ) +
    ggtitle(paste0("Stm: ", n_topics, " topics")) +
    xlab("Year") +
    ylab("Proportion of decisions belonging to topic") +
    geom_point() +
    stat_smooth() +
    labs(color = "Topic\n")

  Filename <- paste0("~/topics_", n_topics, ".png")
  ggsave(filename = Filename, plot = gplt)

  # One line per topic: its ten highest-FREX words, comma-separated.
  words <- labelTopics(stm_output, n = 10)$frex[, 1:10, drop = FALSE]
  txt <- apply(words, 1, paste, collapse = ", ")
  write_lines(
    txt,
    paste0("~/topics_", n_topics, ".txt")
  )
}




